Main Figures
Fig 1b: Proportion z-test for eQTLs associations with a single vs multiple spatial evidence
library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'readr' was built under R version 3.3.2
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
spatial.df <- data.frame(Category=c("spatial.pairs","eQTL.associations"),
One=c(686134,2575),
Many=c(496903,73438),
Total=c(1183037,76013))
spatial.df <- gather(spatial.df, hic.loops, value,One:Many)
spatial.df$percentage <- spatial.df$value / spatial.df$Total * 100
prop.test(x=subset(spatial.df, Category=="eQTL.associations")$value,
n=subset(spatial.df, Category=="spatial.pairs")$value,
alternative="less")
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: subset(spatial.df, Category == "eQTL.associations")$value out of subset(spatial.df, Category == "spatial.pairs")$value
## X-squared = 99444, df = 1, p-value < 2.2e-16
## alternative hypothesis: less
## 95 percent confidence interval:
## -1.0000000 -0.1431998
## sample estimates:
## prop 1 prop 2
## 0.003752911 0.147791420
Fig 1b: Plot eQTL associations with a single vs multiple spatial evidence
ggplot(spatial.df,
aes(factor(Category, levels=c("spatial.pairs","eQTL.associations"))))+
geom_bar(aes(weight=percentage, fill=factor(hic.loops, levels=c("One","Many"))), position=position_dodge())+
theme_classic()+
scale_y_continuous("Percentage", limits=c(0,100), expand=c(0,0))+
scale_x_discrete("", labels=c("SNP-gene pairs", "eQTL associations"))+
scale_fill_brewer("Captured interactions", type="qual", palette = 7)#+
# ggsave("analysis/review_hic/hic_loops.pdf", width=6, height=4)
Fig 1c: Plot GWAS-Matched vs Novel eQTL-eGene mapping
mapping.df <- data.frame(
Mapping = rep(c("CoDeS3D","GWAS & CoDeS3D"),each=2),
Type = rep(c("Unique","Total"),2),
Proportion = c(0.815, 0.757, 0.185, 0.243))
ggplot(mapping.df, aes(x=Type,y=Proportion))+
geom_bar(aes(fill=Mapping),stat = "identity")+#,position = position_dodge())+#, width = 0.8)+
theme_classic()+
labs(fill="SNP-gene mapping")+
scale_y_continuous("Proportion", expand = c(0,0), limits = c(0,1))+
scale_x_discrete("",
label=c("eQTL-eGene pairs","Tissue associations"))#+
# ggsave("interactions_summary.eps", width = 6, height = 4, dpi=300)
Fig 1e: eQTL-eGene fragment distance
library(tidyverse)
gtex.df <- read.delim('analysis/chr22/gene_fragments/fragment_gtex_only.txt', sep='\t', header=FALSE)
gtex.df$method <- 'GTEx'
hic_gtex.df <- read.delim('analysis/chr22/gene_fragments/fragment_sighic_gtex.txt', sep='\t', header=FALSE)
hic_gtex.df$method <- 'HiC+GTEx'
hic.df <- read.delim('analysis/chr22/gene_fragments/fragment_sighic.txt', sep='\t', header=FALSE)
hic.df$method <- 'HiC'
chr22.df <- rbind(gtex.df,hic_gtex.df,hic.df)
chr22.df$Label <- ifelse(abs(chr22.df$V15) < 1000000, "<1Mb", ">=1Mb")
ggplot(chr22.df, aes(abs(V15)))+
geom_freqpoly(aes(color=chr22.df$method), bins=100)+
theme_classic()+
scale_x_continuous('eQTL SNP-eGene fragment distance (Mb)')+
scale_color_manual('Method',
values=c("#989936", "#016699", "#cd6632"),
labels=c("GTEx only", "CoDeS3D only","CoDeS3D + GTEx"))+
facet_grid(. ~Label, scales="free")#+
# ggsave('fragment_distances_facet.eps', width=5, height=4)
Fig 1f: Boxplot of eQTL effect sizes of methods
gtex.df <- read.delim('analysis/chr22/gene_fragments/gtex_only.txt', header=FALSE)
gtex.df$method <- 'GTEx'
hic_gtex.df <- read.delim('analysis/chr22/gene_fragments/sighic_gtex.txt', header=FALSE)
hic_gtex.df$method <- 'HiC+GTEx'
hic_only.df <- read.delim('analysis/chr22/gene_fragments/sighic.txt', header=FALSE)
hic_only.df$method <- 'HiC'
chr22.df <- rbind(gtex.df,hic_gtex.df,hic_only.df)
ggplot(chr22.df, aes(y=V7))+
geom_boxplot(aes(color=factor(method, levels=c("GTEx","HiC+GTEx","HiC"))), notch=TRUE)+
scale_y_continuous("eQTL effect size", limits=c(-2,2), expand=c(0,0))+
scale_x_discrete("")+
scale_color_manual('Method',
values=c("#989936", "#cd6632","#016699"),
labels=c("GTEx only","CoDeS3D + GTEx", "CoDeS3D only"))
Fig1f: T-tests between groups
#GTEx only vs CoDeS3D+GTEx
t.test(gtex.df$V7, hic_gtex.df$V7, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: gtex.df$V7 and hic_gtex.df$V7
## t = 2.5075, df = 1120.2, p-value = 0.0123
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.01256972 0.10300768
## sample estimates:
## mean of x mean of y
## 0.054603440 -0.003185255
#GTEx only vs CoDeS3D only
t.test(gtex.df$V7, hic_only.df$V7, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: gtex.df$V7 and hic_only.df$V7
## t = 2.9401, df = 1301.7, p-value = 0.003339
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.0237212 0.1188609
## sample estimates:
## mean of x mean of y
## 0.05460344 -0.01668762
#CoDeS3D+GTEx vs CoDeS3D only
t.test(hic_gtex.df$V7, hic_only.df$V7, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: hic_gtex.df$V7 and hic_only.df$V7
## t = 0.98499, df = 2181.2, p-value = 0.3247
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.01337994 0.04038466
## sample estimates:
## mean of x mean of y
## -0.003185255 -0.016687617
Figure 4: eQTL effect on FADS1 and FADS2 in the PUFA metabolism cluster
fat.df <- read.delim('analysis/fads_eqtls.txt', sep='\t', header=TRUE)
snps.omit <- c('rs174479') # rs174479 lies in RAB3IL1, which is outside of the locus we're looking at.
fat.df <- subset(fat.df, !(SNP.Id%in%snps.omit))
fads1 <- subset(fat.df, Gene.Symbol=='FADS1' || Gene.Symbol=='FADS2')
fads2 <- subset(fat.df, Gene.Symbol=='FADS2')
snps <- c('rs174528', 'rs174529', 'rs108499', 'rs509360', 'rs174534',
'rs174535', 'rs174536', 'rs174537', 'rs102275', 'rs174538',
'rs4246215', 'rs174541', 'rs174545','rs174546', 'rs174547',
'rs174548', 'rs174549', 'rs174550', 'rs174554', 'rs174555',
'rs174556', 'rs968567', 'rs174570', 'rs1535', 'rs174574',
'rs2845573', 'rs174575', 'rs2727270', 'rs2727271', 'rs174576',
'rs2524299', 'rs174577', 'rs2072114', 'rs174578', 'rs174583', 'rs2851682',
'rs174601', 'rs2526678', 'rs422249', 'rs174448', 'rs174449', 'rs1000778')
tissues <- c("Adipose - Subcutaneous", "Adipose - Visceral (Omentum)",
"Artery - Aorta", "Artery - Tibial", "Brain - Anterior cingulate cortex (BA24)",
"Brain - Cerebellar Hemisphere", "Brain - Cerebellum", "Brain - Cortex",
"Brain - Frontal Cortex (BA9)", "Brain - Nucleus accumbens (basal ganglia)",
"Brain - Putamen (basal ganglia)", "Breast - Mammary Tissue",
"Cells - EBV-transformed lymphocytes", "Cells - Transformed fibroblasts",
"Colon - Sigmoid", "Colon - Transverse", "Esophagus - Gastroesophageal Junction",
"Esophagus - Mucosa", "Esophagus - Muscularis", "Heart - Atrial Appendage",
"Heart - Left Ventricle", "Liver", "Lung", "Muscle - Skeletal", "Nerve - Tibial",
"Pancreas", "Skin - Not Sun Exposed (Suprapubic)", "Skin - Sun Exposed (Lower leg)",
"Small Intestine - Terminal Ileum", "Spleen","Stomach", "Testis", "Thyroid",
"Uterus", "Whole Blood")
fads.genes <- c('FADS1', 'FADS2')
data <- setNames(data.frame(matrix(ncol = 3,
nrow = length(tissues)*length(snps)*length(fads.genes))
), c('SNP.Id', 'Gene.Symbol', 'Tissue'))
i <- 1
for(t in tissues){
for(s in snps){
for(g in fads.genes){
data[i,] <- c(as.character(s), as.character(g), as.character(t))
i <- i+1
}
}
}
merged <- left_join(data, fads1)
## Joining, by = c("SNP.Id", "Gene.Symbol", "Tissue")
## Warning: Column `SNP.Id` joining character vector and factor, coercing into
## character vector
## Warning: Column `Gene.Symbol` joining character vector and factor, coercing
## into character vector
## Warning: Column `Tissue` joining character vector and factor, coercing into
## character vector
tissue.test <- function(data){
ifelse(data$SNP==fads2$SNP.Id && data$Tissue==fads2$Tissue && data$Gene==fads2$Gene.Symbol,
c(fads2$SNP.Id, fads2$Gene.Symbol, tissue, fads2$Effect.Size),
c(fads2$SNP.Id, fads2$Gene.Symbol, tissue,'NA'))
}
ggplot(merged, aes(y=factor(SNP.Id, levels=snps), x=factor(Tissue, levels=tissues)))+
geom_tile(aes(fill=Effect.Size))+
scale_x_discrete('Tissues')+
scale_y_discrete('eQTL SNPs')+
scale_fill_gradientn('Effect Size', colors=terrain.colors(10),
na.value = 'transparent',
#low='#d7191c', mid='#ffffbf', high='#2b83ba',
limits=c(-1.0,1.0))+
theme(axis.text.x = element_text(angle=90,hjust=1,vjust=0.01))+
facet_grid(. ~ Gene.Symbol)#+
Fig 5a and b data
hic.data <- read.delim("analysis/hic_all_celltype.txt", header = T)
hic.data <- subset(hic.data, hic.data$Cell != '')
tissues <- levels(hic.data$Tissue)
Fig 5a: Heatmap of eQTL p-values in Cells
new.data <- data.frame()
for(t in tissues){
a <- subset(hic.data, Tissue==t)
a1 <- subset(a, Tissue==t & Cell=='GM12878')
a1$PNorm <- mean(a1$Pval)
a2 <- subset(a, Tissue==t & Cell=='HMEC')
a2$PNorm <- mean(a2$Pval)
a3 <- subset(a, Tissue==t & Cell=='HUVEC')
a3$PNorm <- mean(a3$Pval)
a4 <- subset(a, Tissue==t & Cell=='IMR90')
a4$PNorm <- mean(a4$Pval)
a5 <- subset(a, Tissue==t & Cell=='K562')
a5$PNorm <- mean(a5$Pval)
a6 <- subset(a, Tissue==t & Cell=='KBM7')
a6$PNorm <- mean(a6$Pval)
a7 <- subset(a, Tissue==t & Cell=='NHEK')
a7$PNorm <- mean(a7$Pval)
a <- rbind(a1,a2,a3,a4,a5,a6,a7)
a$PNorm <- 1 - (((a$PNorm - min(a$PNorm))/max(a$PNorm))* max(a$PNorm) / (max(a$PNorm) - min(a$PNorm)))
new.data <- rbind(new.data, a)
}
ggplot(new.data, aes(x=new.data$Cell, y=new.data$Tissue))+
geom_tile(aes(fill=new.data$PNorm))+
scale_fill_gradient2(expression(P[eQTL]),mid='#ffffbf', low='#ffffff', high='#053061',
breaks=c(0,1),labels=c('Max', 'Min'), limits=c(0,1))+
scale_y_discrete(expand = c(0,0), limits=rev(levels(new.data$Tissue)))+
scale_x_discrete(expand = c(0,0))+
labs(x='HiC cell lines', y='GTEx tissues')+
theme(legend.position = 'top',
axis.text.y =element_text(''))#+
# ggsave("hic_tissue_pval.pdf", width = 8, height = 10, dpi=300)
Fig 5b: Heatmap of Hi-C contacts in Cells
contacts.df <- data.frame()
for(t in tissues){
a <- subset(hic.data, Tissue==t)
a1 <- subset(a, Tissue==t & Cell=='GM12878')
a1$CNorm <- mean(a1$HiC)
a2 <- subset(a, Tissue==t & Cell=='HMEC')
a2$CNorm <- mean(a2$HiC)
a3 <- subset(a, Tissue==t & Cell=='HUVEC')
a3$CNorm <- mean(a3$HiC)
a4 <- subset(a, Tissue==t & Cell=='IMR90')
a4$CNorm <- mean(a4$HiC)
a5 <- subset(a, Tissue==t & Cell=='K562')
a5$CNorm <- mean(a5$HiC)
a6 <- subset(a, Tissue==t & Cell=='KBM7')
a6$CNorm <- mean(a6$HiC)
a7 <- subset(a, Tissue==t & Cell=='NHEK')
a7$CNorm <- mean(a7$HiC)
a <- rbind(a1,a2,a3,a4,a5,a6,a7)
a$CNorm <- 1 - (((a$CNorm - min(a$CNorm))/max(a$CNorm))* max(a$CNorm) / (max(a$CNorm) - min(a$CNorm)))
contacts.df <- rbind(contacts.df, a)
}
ggplot(contacts.df, aes(x=contacts.df$Cell, y=contacts.df$Tissue))+
geom_raster(aes(fill=contacts.df$CNorm))+
scale_fill_gradient2('Hic contacts',mid='#ffffbf', low='#ffffff', high='#053061',
breaks=c(0,1),labels=c('Min', 'Max'), limits=c(0,1))+
scale_y_discrete(expand = c(0,0), limits=rev(levels(contacts.df$Tissue)))+
scale_x_discrete(expand = c(0,0))+
labs(x='HiC cell lines', y='GTEx tissues')+
theme(legend.position = 'top',
axis.text.y =element_text(''))
# ggsave("hic_tissue_counts.pdf", width = 8, height = 10, dpi=300)
Fig 6: OMIM Analysis
library(VennDiagram)
## Loading required package: grid
## Loading required package: futile.logger
grid.draw(VennDiagram::draw.pairwise.venn(16313, 7917, 5069, c('OMIM','Spatial'),
fill=c('#A6CEE3','#1F78B4'), cex=1.5 ,scaled = T))
df <- data.frame(mapping=c('MappedGenes','MappedGenes','UnknownDefect','UnknownDefect',
'Linkage','Linkage', 'GeneMutation','GeneMutation','DupDel','DupDel'),
value=c(0.311,0.295,0.013,0.012,0.154,0.006,0.813,0.982,0.02,0.0004),
dataset=c('OMIM','Spatial','OMIM','Spatial','OMIM','Spatial','OMIM','Spatial','OMIM','Spatial'))
ggplot(df,aes(df$mapping,df$value,fill=df$dataset))+
geom_bar(stat = 'identity',position = position_dodge(0.9))+
theme_classic()+
scale_y_continuous(expand = c(0,0), limits = c(0,1))+
scale_fill_brewer(palette = 'Paired',guide=guide_legend(title='Dataset'))+
scale_x_discrete(limits=c('MappedGenes','UnknownDefect','Linkage','GeneMutation','DupDel'),
label=c('Mapped Genes','Unknown Defect','Linkage','Mutation','Duplication'))+
labs(x='', y='Proportion')#+
# ggsave("omim_bar.pdf", width=8, height=7)
# grid.newpage();
# pdf("venn.pdf")
# v <- draw.pairwise.venn(16313, 7917, 5069, c('OMIM','Spatial'), fill=c('#A6CEE3','#1F78B4'), cex=1.5 ,scaled = T);
# grid.draw(v);
# dev.off()